library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.0
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytable)
## Warning: tidytable was loaded after dplyr.
## This can lead to most dplyr functions being overwritten by tidytable functions.
## Warning: tidytable was loaded after tidyr.
## This can lead to most tidyr functions being overwritten by tidytable functions.
## 
## Attaching package: 'tidytable'
## The following objects are masked from 'package:dplyr':
## 
##     across, add_count, add_tally, anti_join, arrange, between,
##     bind_cols, bind_rows, c_across, case_match, case_when, coalesce,
##     consecutive_id, count, cross_join, cume_dist, cur_column, cur_data,
##     cur_group_id, cur_group_rows, dense_rank, desc, distinct, filter,
##     first, full_join, group_by, group_cols, group_split, group_vars,
##     if_all, if_any, if_else, inner_join, is_grouped_df, lag, last,
##     lead, left_join, min_rank, mutate, n, n_distinct, na_if, nest_by,
##     nest_join, nth, percent_rank, pick, pull, recode, relocate, rename,
##     rename_with, right_join, row_number, rowwise, select, semi_join,
##     slice, slice_head, slice_max, slice_min, slice_sample, slice_tail,
##     summarise, summarize, tally, top_n, transmute, tribble, ungroup
## The following objects are masked from 'package:purrr':
## 
##     map, map_chr, map_dbl, map_df, map_dfc, map_dfr, map_int, map_lgl,
##     map_vec, map2, map2_chr, map2_dbl, map2_df, map2_dfc, map2_dfr,
##     map2_int, map2_lgl, map2_vec, pmap, pmap_chr, pmap_dbl, pmap_df,
##     pmap_dfc, pmap_dfr, pmap_int, pmap_lgl, pmap_vec, walk
## The following objects are masked from 'package:tidyr':
## 
##     complete, crossing, drop_na, expand, expand_grid, extract, fill,
##     nest, nesting, pivot_longer, pivot_wider, replace_na, separate,
##     separate_longer_delim, separate_rows, separate_wider_delim,
##     separate_wider_regex, tribble, uncount, unite, unnest,
##     unnest_longer, unnest_wider
## The following objects are masked from 'package:tibble':
## 
##     enframe, tribble
## The following objects are masked from 'package:stats':
## 
##     dt, filter, lag
## The following object is masked from 'package:base':
## 
##     %in%
# Fetch the Stack Overflow Developer Survey 2020 files only when they are not
# already present in the working directory. This keeps the script runnable on
# a fresh checkout without re-downloading the data on every run.
survey_base_url <- paste0(
  "https://raw.githubusercontent.com/JovianML/opendatasets/master/data/",
  "stackoverflow-developer-survey-2020/"
)
survey_files <- c(
  "survey_results_schema.csv",
  "survey_results_public.csv",
  "README.txt"
)
for (survey_file in survey_files[!file.exists(survey_files)]) {
  # mode = "wb" stores the bytes exactly as served (no newline translation).
  download.file(paste0(survey_base_url, survey_file), survey_file, mode = "wb")
}

# Raw survey responses, with column types guessed by readr.
survey_raw_df <- read_csv("survey_results_public.csv")
## Rows: 64461 Columns: 61
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (56): MainBranch, Hobbyist, Age1stCode, CompFreq, Country, CurrencyDesc,...
## dbl  (5): Respondent, Age, CompTotal, ConvertedComp, WorkWeekHrs
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the raw responses.
survey_raw_df
# Load the schema file: one row per survey column, with the question text
# that was asked for it.
schema_raw_df <- readr::read_csv(file = "survey_results_schema.csv")
## Rows: 61 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Column, QuestionText
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the schema table, then show its internal structure.
schema_raw_df
schema_raw_df |>
  str()
## spc_tbl_ [61 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Column      : chr [1:61] "Respondent" "MainBranch" "Hobbyist" "Age" ...
##  $ QuestionText: chr [1:61] "Randomized respondent ID number (not in order of survey response time)" "Which of the following options best describes you today? Here, by \"developer\" we mean \"someone who writes code.\"" "Do you code as a hobby?" "What is your age (in years)? If you prefer not to answer, you may leave this question blank." ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Column = col_character(),
##   ..   QuestionText = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# First schema column (the survey column names), kept as a one-column table.
schema_raw_df["Column"]
# Question text that belongs to the YearsCodePro column.
ind <- schema_raw_df[["Column"]] == "YearsCodePro"
schema_raw_df[["QuestionText"]][ind]
## [1] "NOT including education, how many years have you coded professionally (as a part of your work)?"
# Survey columns kept for the analysis.
selected_columns <- c(
  "Country",
  "Age",
  "Gender",
  "EdLevel",
  "UndergradMajor",
  "Hobbyist",
  "Age1stCode",
  "YearsCode",
  "YearsCodePro",
  "LanguageWorkedWith",
  "LanguageDesireNextYear",
  "NEWLearn",
  "NEWStuck",
  "Employment",
  "DevType",
  "WorkWeekHrs",
  "JobSat",
  "JobFactors",
  "NEWOvertime",
  "NEWEdImpt"
)
# Number of selected columns.
length(selected_columns)
## [1] 20
# Keep only the columns chosen for the analysis.
survey_df <- survey_raw_df |>
  select(all_of(selected_columns))

# Schema rows that describe the selected columns.
ind <- which(schema_raw_df[["Column"]] %in% selected_columns)
schema <- schema_raw_df[ind, ]
schema

# Structure of the reduced response table.
str(survey_df)
## Classes 'tidytable', 'data.table' and 'data.frame':  64461 obs. of  20 variables:
##  $ Country               : chr  "Germany" "United Kingdom" "Russian Federation" "Albania" ...
##  $ Age                   : num  NA NA NA 25 31 NA NA 36 30 22 ...
##  $ Gender                : chr  "Man" NA NA "Man" ...
##  $ EdLevel               : chr  "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)" "Bachelor’s degree (B.A., B.S., B.Eng., etc.)" NA "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)" ...
##  $ UndergradMajor        : chr  "Computer science, computer engineering, or software engineering" "Computer science, computer engineering, or software engineering" NA "Computer science, computer engineering, or software engineering" ...
##  $ Hobbyist              : chr  "Yes" "No" "Yes" "Yes" ...
##  $ Age1stCode            : chr  "13" "19" "15" "18" ...
##  $ YearsCode             : chr  "36" "7" "4" "7" ...
##  $ YearsCodePro          : chr  "27" "4" NA "4" ...
##  $ LanguageWorkedWith    : chr  "C#;HTML/CSS;JavaScript" "JavaScript;Swift" "Objective-C;Python;Swift" NA ...
##  $ LanguageDesireNextYear: chr  "C#;HTML/CSS;JavaScript" "Python;Swift" "Objective-C;Python;Swift" NA ...
##  $ NEWLearn              : chr  "Once a year" "Once a year" "Once a decade" "Once a year" ...
##  $ NEWStuck              : chr  "Visit Stack Overflow;Go for a walk or other physical activity;Do other work and come back later" "Visit Stack Overflow;Go for a walk or other physical activity" NA NA ...
##  $ Employment            : chr  "Independent contractor, freelancer, or self-employed" "Employed full-time" NA NA ...
##  $ DevType               : chr  "Developer, desktop or enterprise applications;Developer, full-stack" "Developer, full-stack;Developer, mobile" NA NA ...
##  $ WorkWeekHrs           : num  50 NA NA 40 NA NA NA 39 50 36 ...
##  $ JobSat                : chr  "Slightly satisfied" "Very dissatisfied" NA "Slightly dissatisfied" ...
##  $ JobFactors            : chr  "Languages, frameworks, and other technologies I’d be working with;Remote work options;Opportunities for profess"| __truncated__ NA NA "Flex time or a flexible schedule;Office environment or company culture;Opportunities for professional development" ...
##  $ NEWOvertime           : chr  "Often: 1-2 days per week or more" NA NA "Occasionally: 1-2 days per quarter but less than monthly" ...
##  $ NEWEdImpt             : chr  "Fairly important" "Fairly important" NA "Not at all important/not necessary" ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Convert the three character columns below to numeric. Entries that cannot be
# parsed as a number become NA, which is what triggers the coercion warnings.
survey_df <- survey_df |>
  mutate(
    Age1stCode = as.numeric(Age1stCode),
    YearsCode = as.numeric(YearsCode),
    YearsCodePro = as.numeric(YearsCodePro)
  )
## Warning in vctrs::vec_recycle(as.numeric(Age1stCode), .N): NAs introduced by
## coercion
## Warning in vctrs::vec_recycle(as.numeric(YearsCode), .N): NAs introduced by
## coercion
## Warning in vctrs::vec_recycle(as.numeric(YearsCodePro), .N): NAs introduced by
## coercion
# Confirm the new column types.
survey_df |>
  str()
## Classes 'tidytable', 'data.table' and 'data.frame':  64461 obs. of  20 variables:
##  $ Country               : chr  "Germany" "United Kingdom" "Russian Federation" "Albania" ...
##  $ Age                   : num  NA NA NA 25 31 NA NA 36 30 22 ...
##  $ Gender                : chr  "Man" NA NA "Man" ...
##  $ EdLevel               : chr  "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)" "Bachelor’s degree (B.A., B.S., B.Eng., etc.)" NA "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)" ...
##  $ UndergradMajor        : chr  "Computer science, computer engineering, or software engineering" "Computer science, computer engineering, or software engineering" NA "Computer science, computer engineering, or software engineering" ...
##  $ Hobbyist              : chr  "Yes" "No" "Yes" "Yes" ...
##  $ Age1stCode            : num  13 19 15 18 16 14 18 12 20 14 ...
##  $ YearsCode             : num  36 7 4 7 15 6 6 17 6 8 ...
##  $ YearsCodePro          : num  27 4 NA 4 8 4 4 13 4 4 ...
##  $ LanguageWorkedWith    : chr  "C#;HTML/CSS;JavaScript" "JavaScript;Swift" "Objective-C;Python;Swift" NA ...
##  $ LanguageDesireNextYear: chr  "C#;HTML/CSS;JavaScript" "Python;Swift" "Objective-C;Python;Swift" NA ...
##  $ NEWLearn              : chr  "Once a year" "Once a year" "Once a decade" "Once a year" ...
##  $ NEWStuck              : chr  "Visit Stack Overflow;Go for a walk or other physical activity;Do other work and come back later" "Visit Stack Overflow;Go for a walk or other physical activity" NA NA ...
##  $ Employment            : chr  "Independent contractor, freelancer, or self-employed" "Employed full-time" NA NA ...
##  $ DevType               : chr  "Developer, desktop or enterprise applications;Developer, full-stack" "Developer, full-stack;Developer, mobile" NA NA ...
##  $ WorkWeekHrs           : num  50 NA NA 40 NA NA NA 39 50 36 ...
##  $ JobSat                : chr  "Slightly satisfied" "Very dissatisfied" NA "Slightly dissatisfied" ...
##  $ JobFactors            : chr  "Languages, frameworks, and other technologies I’d be working with;Remote work options;Opportunities for profess"| __truncated__ NA NA "Flex time or a flexible schedule;Office environment or company culture;Opportunities for professional development" ...
##  $ NEWOvertime           : chr  "Often: 1-2 days per week or more" NA NA "Occasionally: 1-2 days per quarter but less than monthly" ...
##  $ NEWEdImpt             : chr  "Fairly important" "Fairly important" NA "Not at all important/not necessary" ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Column-wise summary of the selected survey data.
survey_df |>
  summary()
##    Country               Age            Gender            EdLevel         
##  Length:64461       Min.   :  1.00   Length:64461       Length:64461      
##  Class :character   1st Qu.: 24.00   Class :character   Class :character  
##  Mode  :character   Median : 29.00   Mode  :character   Mode  :character  
##                     Mean   : 30.83                                        
##                     3rd Qu.: 35.00                                        
##                     Max.   :279.00                                        
##                     NA's   :19015                                         
##  UndergradMajor       Hobbyist           Age1stCode      YearsCode    
##  Length:64461       Length:64461       Min.   : 5.00   Min.   : 1.00  
##  Class :character   Class :character   1st Qu.:12.00   1st Qu.: 6.00  
##  Mode  :character   Mode  :character   Median :15.00   Median :10.00  
##                                        Mean   :15.48   Mean   :12.78  
##                                        3rd Qu.:18.00   3rd Qu.:17.00  
##                                        Max.   :85.00   Max.   :50.00  
##                                        NA's   :6988    NA's   :7677   
##   YearsCodePro   LanguageWorkedWith LanguageDesireNextYear   NEWLearn        
##  Min.   : 1.00   Length:64461       Length:64461           Length:64461      
##  1st Qu.: 3.00   Class :character   Class :character       Class :character  
##  Median : 6.00   Mode  :character   Mode  :character       Mode  :character  
##  Mean   : 8.87                                                               
##  3rd Qu.:12.00                                                               
##  Max.   :50.00                                                               
##  NA's   :20328                                                               
##    NEWStuck          Employment          DevType           WorkWeekHrs    
##  Length:64461       Length:64461       Length:64461       Min.   :  1.00  
##  Class :character   Class :character   Class :character   1st Qu.: 40.00  
##  Mode  :character   Mode  :character   Mode  :character   Median : 40.00  
##                                                           Mean   : 40.78  
##                                                           3rd Qu.: 44.00  
##                                                           Max.   :475.00  
##                                                           NA's   :23310   
##     JobSat           JobFactors        NEWOvertime         NEWEdImpt        
##  Length:64461       Length:64461       Length:64461       Length:64461      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
## 
# Number of responses per Gender value before any cleaning.
survey_df |>
  summarise(Count = n(), .by = Gender)

# Same counts after excluding ages below 10 and work weeks above 140 hours;
# rows where Age or WorkWeekHrs is missing are kept.
# NOTE(review): this filter only feeds the printed summary; survey_df itself is
# not modified here. Confirm whether the filtered data should be stored.
survey_df |>
  filter(Age >= 10 | is.na(Age)) |>
  filter(WorkWeekHrs <= 140 | is.na(WorkWeekHrs)) |>
  summarise(Count = n(), .by = Gender)

# Gender values containing ";" (presumably several options selected at once)
# are set to missing. NA_character_ keeps the column type explicit.
survey_df <- survey_df |>
  mutate(Gender = if_else(grepl(";", Gender), NA_character_, Gender))

survey_df |>
  summarise(Count = n(), .by = Gender)

# Show 10 randomly chosen rows. Base sample() is not suitable for this: it
# indexes the table with numbers drawn from 1..ncol(survey_df), so it never
# draws from the full set of rows.
survey_df |>
  slice_sample(n = 10)

Exploratory Analysis and Visualization

Before we ask questions about the survey responses, it would help to understand the respondents’ demographics, i.e., country, age, gender, education level, employment level, etc. It’s essential to explore these variables to understand how representative the survey is of the worldwide programming community. A survey of this scale generally tends to have some selection bias.

Country

Let’s look at the number of countries from which there are responses in the survey and plot the fifteen countries with the highest number of responses.

# Look up the survey question that belongs to the Country column.
ind_country <- schema[["Column"]] == "Country"
schema[["QuestionText"]][ind_country]
## [1] "Where do you live?"
# Keep the question text (it is used as the title of the country plot), then
# count the distinct Country values in the responses.
country_question <- schema[["QuestionText"]][ind_country]
length(unique(survey_df[["Country"]]))
## [1] 184
# The 15 countries with the most responses. Country is turned into a factor
# whose levels follow descending Count, so the bars are drawn largest first.
# reorder(Country, -Count) is used rather than `decreasing = TRUE` because that
# argument only exists in R >= 4.3; older versions silently ignore it and
# leave the bars in ascending order.
top_countries <- survey_df |>
  summarise(Count = n(), .by = Country) |>
  slice_max(order_by = Count, n = 15) |>
  mutate(Country = reorder(Country, -Count))
top_countries

# Bar chart of the response counts, titled with the survey question text.
top_countries |>
  ggplot(aes(x = Country, y = Count, fill = Country)) +
  geom_col(show.legend = FALSE) +
  ggtitle(country_question) +
  theme(axis.text.x = element_text(angle = 75, vjust = 1, hjust = 1))

# Histogram of respondent ages in 5-year bins, restricted to ages 10-80.
# The limits are set inside scale_x_continuous(): a separate xlim() call would
# replace this scale and drop its name and breaks.
survey_df |>
  ggplot(aes(Age)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "purple") +
  scale_x_continuous(
    name = "Age",
    breaks = seq(10, 80, 10),
    limits = c(10, 80)
  ) +
  ylab("Number of respondents")
## Warning: Removed 19053 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

It appears that a large percentage of respondents are 20-45 years old. It’s somewhat representative of the programming community in general. Many young people have taken up computer science as their field of study or profession in the last 20 years.

Exercise: You may want to filter out responses by age (or age group) if you’d like to analyze and compare the survey results for different age groups. Create a new column called AgeGroup containing values like Less than 10 years, 10-18 years, 18-30 years, 30-45 years, 45-60 years and Older than 60 years. Then, repeat the analysis in the rest of this notebook for each age group.

# Add an AgeGroup label derived from Age. Conditions are checked in order, so
# each one only needs its upper bound. The last group is an explicit
# `Age > 55` rather than a catch-all, so respondents with a missing Age get a
# missing AgeGroup instead of being counted as "Older than 55".
survey_agegroup_df <- survey_df |>
  mutate(AgeGroup = case_when(
    Age < 18 ~ "Younger than 18",
    Age <= 25 ~ "18-25 years",
    Age <= 35 ~ "26-35 years",
    Age <= 45 ~ "36-45 years",
    Age <= 55 ~ "46-55 years",
    Age > 55 ~ "Older than 55"
  ))
survey_agegroup_df
# Responses per gender. Only rows with a missing Gender are dropped: removing
# rows with a missing value in any column would discard most respondents and
# bias the gender shares.
# The count column keeps its original name `Age`; it holds the number of
# responses, not an age.
survey_df |>
  drop_na(Gender) |>
  summarise(Age = n(), .by = Gender)

# Share of each gender among the respondents who stated one, plus a
# formatted percentage label for plotting.
survey_df_percent_age <- survey_df |>
  drop_na(Gender) |>
  summarise(Age = n(), .by = Gender) |>
  mutate(Percentage = Age / sum(Age)) |>
  mutate(labels = scales::percent(Percentage))
# Pie chart of the gender shares: one stacked column drawn in polar
# coordinates, with the percentage label centred in each slice.
ggplot(
  survey_df_percent_age,
  aes(x = "", y = Percentage, fill = Gender)
) +
  geom_col() +
  coord_polar(theta = "y") +
  geom_text(
    aes(label = labels),
    position = position_stack(vjust = 0.5)
  ) +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank()
  ) +
  labs(x = "", y = "Percentage of responses")